{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "ca7a8b54-ae51-4036-8ae4-6886e09e3692",
   "metadata": {},
   "source": [
    "- 使用 overlap：这里指的不是滑动窗口切块时相邻窗口之间的重叠，而是在召回时把命中文档块前后的相邻文档块一并取回；LlamaIndex 中有比较方便的实现（`SentenceWindowNodeParser` 配合 `MetadataReplacementPostProcessor`）。\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "028d7dbb-43a3-4077-a0cd-2e555fa91553",
   "metadata": {},
   "outputs": [],
   "source": [
    "from llama_index.core import Settings\n",
    "from llama_index.llms.openai import OpenAI\n",
    "from llama_index.embeddings.huggingface import HuggingFaceEmbedding\n",
    "from llama_index.core.readers import SimpleDirectoryReader\n",
    "from llama_index.vector_stores.faiss import FaissVectorStore\n",
    "from llama_index.core.ingestion import IngestionPipeline\n",
    "from llama_index.core.node_parser import SentenceWindowNodeParser, SentenceSplitter\n",
    "from llama_index.core import VectorStoreIndex\n",
    "from llama_index.core.postprocessor import MetadataReplacementPostProcessor\n",
    "import faiss\n",
    "import os\n",
    "import sys\n",
    "from dotenv import load_dotenv\n",
    "from pprint import pprint"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "024b334a-5ef4-4531-9223-eeccbaf50da6",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "D:\\python311\\Lib\\site-packages\\pydantic\\_internal\\_fields.py:161: UserWarning: Field \"model_name\" has conflict with protected namespace \"model_\".\n",
      "\n",
      "You may be able to resolve this warning by setting `model_config['protected_namespaces'] = ()`.\n",
      "  warnings.warn(\n"
     ]
    }
   ],
   "source": [
    "from llama_index.core.llms import (\n",
    "    CustomLLM,\n",
    "    CompletionResponse,\n",
    "    CompletionResponseGen,\n",
    "    LLMMetadata,\n",
    ")\n",
    "from llama_index.core.llms.callbacks import llm_completion_callback\n",
    "from openai import OpenAI\n",
    "from typing import Optional, List, Mapping, Any\n",
    "\n",
    "class OurLLM(CustomLLM):\n",
    "    \"\"\"LlamaIndex `CustomLLM` wrapper for the GLM chat model behind BigModel's OpenAI-compatible API.\n",
    "\n",
    "    The API key is read from the `ZHIPUAI_API_KEY` environment variable so that no\n",
    "    credential lives in the notebook. Any key that was ever committed in this file\n",
    "    should be treated as leaked and revoked.\n",
    "    \"\"\"\n",
    "\n",
    "    context_window: int = 10000\n",
    "    num_output: int = 512\n",
    "    model_name: str = \"glm-4-plus\"\n",
    "    # Retained only for backward compatibility. The client object is no longer\n",
    "    # stored here: assigning a client to a `str` field made Pydantic emit\n",
    "    # \"Expected `str` but got `OpenAI`\" serializer warnings.\n",
    "    llm: str = \"\"\n",
    "\n",
    "    @property\n",
    "    def metadata(self) -> LLMMetadata:\n",
    "        \"\"\"Return the context window, output size and model name of this LLM.\"\"\"\n",
    "        return LLMMetadata(\n",
    "            context_window=self.context_window,\n",
    "            num_output=self.num_output,\n",
    "            model_name=self.model_name,\n",
    "        )\n",
    "\n",
    "    def _build_client(self) -> Any:\n",
    "        \"\"\"Create a client for the BigModel endpoint.\n",
    "\n",
    "        Returns:\n",
    "            A new `openai.OpenAI` client pointed at the BigModel base URL.\n",
    "\n",
    "        Raises:\n",
    "            ValueError: if the `ZHIPUAI_API_KEY` environment variable is unset or empty.\n",
    "        \"\"\"\n",
    "        # Imported locally under an alias: the notebook also imports\n",
    "        # `llama_index.llms.openai.OpenAI`, so the bare name `OpenAI` depends on\n",
    "        # which import cell ran last.\n",
    "        from openai import OpenAI as OpenAIClient\n",
    "\n",
    "        api_key = os.environ.get(\"ZHIPUAI_API_KEY\")\n",
    "        if not api_key:\n",
    "            raise ValueError(\n",
    "                \"Set the ZHIPUAI_API_KEY environment variable before calling the LLM.\"\n",
    "            )\n",
    "        return OpenAIClient(\n",
    "            base_url=\"https://open.bigmodel.cn/api/paas/v4\",\n",
    "            api_key=api_key,\n",
    "        )\n",
    "\n",
    "    @llm_completion_callback()\n",
    "    def complete(self, prompt: str, **kwargs: Any) -> CompletionResponse:\n",
    "        \"\"\"Send `prompt` as a single user message and return the full reply.\n",
    "\n",
    "        Args:\n",
    "            prompt: Text sent as the only user message.\n",
    "            **kwargs: Accepted for interface compatibility; not forwarded.\n",
    "\n",
    "        Returns:\n",
    "            A `CompletionResponse` whose text is the first choice's message content\n",
    "            (empty string when the API returns no content).\n",
    "        \"\"\"\n",
    "        completion = self._build_client().chat.completions.create(\n",
    "            model=self.model_name,\n",
    "            temperature=0.0,\n",
    "            messages=[{\"role\": \"user\", \"content\": prompt}],\n",
    "        )\n",
    "        return CompletionResponse(text=completion.choices[0].message.content or \"\")\n",
    "\n",
    "    @llm_completion_callback()\n",
    "    def stream_complete(\n",
    "        self, prompt: str, **kwargs: Any\n",
    "    ) -> CompletionResponseGen:\n",
    "        \"\"\"Stream the reply to `prompt`, yielding the accumulated text after each chunk.\n",
    "\n",
    "        Args:\n",
    "            prompt: Text sent as the only user message.\n",
    "            **kwargs: Accepted for interface compatibility; not forwarded.\n",
    "\n",
    "        Yields:\n",
    "            `CompletionResponse` objects carrying the text so far and the latest delta.\n",
    "        \"\"\"\n",
    "        stream = self._build_client().chat.completions.create(\n",
    "            model=self.model_name,\n",
    "            temperature=0.0,\n",
    "            messages=[{\"role\": \"user\", \"content\": prompt}],\n",
    "            stream=True,\n",
    "        )\n",
    "        response = \"\"\n",
    "        for chunk in stream:\n",
    "            # Some chunks carry no choices or a `None` content (for example the\n",
    "            # final chunk); concatenating `None` would raise TypeError.\n",
    "            if not chunk.choices:\n",
    "                continue\n",
    "            delta = chunk.choices[0].delta.content or \"\"\n",
    "            response += delta\n",
    "            yield CompletionResponse(text=response, delta=delta)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "b0084831-f522-4662-8b0e-69198ae1bea7",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Embedding dimension; presumably the output width of the embedding model configured below - confirm.\n",
    "EMBED_DIMENSION = 512\n",
    "\n",
    "# Register the custom GLM wrapper and the local embedding model as LlamaIndex-wide defaults.\n",
    "Settings.llm = OurLLM()\n",
    "Settings.embed_model = HuggingFaceEmbedding(\n",
    "    model_name=\"../../bge-small-zh-v1.5\",\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "a327d9a4-346d-4105-ad60-40692fc41e5d",
   "metadata": {},
   "outputs": [],
   "source": [
    "from llama_index.core.schema import Document"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "3e4ad0b3-9dba-437a-885c-720c8e92a1fb",
   "metadata": {},
   "outputs": [],
   "source": [
    "text = \"\"\"蔚来萤火虫即将交付，李斌所预测的两年内新能源车渗透率超80%能否实现？\n",
    "\n",
    "9 月 6 日消息，近日，蔚来汽车创始人、董事长李斌在蔚来九周年内部讲话及财报电话会上透露了多项重要信息，其中最引人注目的莫过于第三品牌萤火虫将于 2025 年正式交付，并预测新能源汽车的市场渗透率将在未来两年内超过 80%。\n",
    "\n",
    "据李斌介绍，蔚来汽车将形成三个品牌矩阵，覆盖从 14 万元到 80 万元的广阔市场区间。其中，第三品牌萤火虫作为蔚来汽车布局中低端市场的重要棋子，将于 2025 年正式交付。这一举措不仅丰富了蔚来的产品线，也进一步提升了其在新能源汽车市场的竞争力。\n",
    "\n",
    "在谈及新能源汽车市场渗透率时，李斌表示出极大的信心。他指出，当前新能源汽车的市场渗透率已经超过 50%，并预计在未来两年内将突破 80% 的大关。这一预测基于多个因素的综合考量，包括新能源汽车价格的持续下降、性能的不断提升以及用户购买力的增强等。随着新能源汽车技术的不断成熟和充电基础设施的日益完善，新能源汽车的普及速度将进一步加快，对传统燃油车的替代效应也将愈发明显。\n",
    "\n",
    "李斌还透露，蔚来汽车正在为 2025 年至 2026 年的全线产品做准备。F2 工厂已经开始双班生产，并计划在 9 月底到 10 月达到双班生产能力。这一举措将显著提升蔚来汽车的产能，为萤火虫等新品牌的交付提供有力保障。\n",
    "\n",
    "此外，李斌还强调了换电业务的可持续经营和换电联盟的建立。他表示，换电是蔚来汽车巨大的先发优势，目前已经到了面向全行业开放的时刻。蔚来汽车将与多家企业合作，共同推动换电业务的发展，提升新能源汽车的使用便利性和用户体验。\n",
    "\n",
    "在谈及市场竞争时，李斌表示，未来几年新能源汽车市场的竞争强度只会增加不会下降。他呼吁全体员工要提升效率、狠抓执行，确保蔚来汽车在未来的竞争中保持领先地位。同时，他也表示蔚来汽车有足够的信心跨越资格赛最惨烈的赛段，赢得决赛的资格。\n",
    "蔚来汽车第三品牌萤火虫的即将交付以及新能源汽车市场渗透率的快速提升，无疑为整个新能源汽车行业带来了新的发展机遇和挑战。随着新能源汽车技术的不断进步和市场的日益成熟，我们有理由相信新能源汽车的普及速度将进一步加快，为汽车产业的转型升级和可持续发展注入新的动力。\"\"\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "51e3032a-3452-4831-90f5-925fb06ac73e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Wrap the sample article in a single Document tagged with a file name.\n",
    "doc_metadata = {'file_name': 'test.txt'}\n",
    "documents = [Document(text=text, metadata=doc_metadata)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "42db5c08-2d4d-4a27-a336-bdfec1d8e817",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Size the L2 index from the EMBED_DIMENSION constant instead of repeating the literal 512,\n",
    "# so the index width cannot drift from the configured embedding dimension.\n",
    "faiss_index = faiss.IndexFlatL2(EMBED_DIMENSION)\n",
    "vector_store = FaissVectorStore(faiss_index=faiss_index)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "3fdf942d-c319-4dac-a92e-75b99cd3d185",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Baseline: plain sentence splitting (chunk_size=64, chunk_overlap=12) with no neighbouring context.\n",
    "# NOTE(review): no embedding step is listed in `transformations`, so the FAISS store may\n",
    "# never receive vectors from this pipeline - confirm against the IngestionPipeline docs.\n",
    "base_splitter = SentenceSplitter(chunk_size=64, chunk_overlap=12)\n",
    "base_pipeline = IngestionPipeline(\n",
    "    transformations=[base_splitter],\n",
    "    vector_store=vector_store,\n",
    ")\n",
    "base_nodes = base_pipeline.run(documents=documents)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "6fc31e0c-6109-4aa4-a3ae-6f8020e8b06d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sentence-window parsing: the surrounding sentences (window_size=3) are kept in each\n",
    "# node's metadata under \"window\", and the sentence itself under \"original_sentence\".\n",
    "node_parser = SentenceWindowNodeParser(\n",
    "    window_size=3,\n",
    "    window_metadata_key=\"window\",\n",
    "    original_text_metadata_key=\"original_sentence\",\n",
    ")\n",
    "\n",
    "# Dedicated FAISS store: the baseline pipeline already uses `vector_store`, and two\n",
    "# different chunking strategies should not write into one shared index.\n",
    "windowed_vector_store = FaissVectorStore(\n",
    "    faiss_index=faiss.IndexFlatL2(EMBED_DIMENSION),\n",
    ")\n",
    "\n",
    "pipeline = IngestionPipeline(\n",
    "    transformations=[node_parser],\n",
    "    vector_store=windowed_vector_store,\n",
    ")\n",
    "\n",
    "windowed_nodes = pipeline.run(documents=documents)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "73b950e3-5456-4032-8ca6-43ff3537ca49",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Test question sent to both the baseline and the sentence-window query engines.\n",
    "query = \"蔚来萤火虫主打什么市场？\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "c72bf8f6-9a82-4da8-89fe-a676949cce38",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Baseline engine: index the plain chunks and answer from the single best match.\n",
    "base_index = VectorStoreIndex(base_nodes)\n",
    "base_query_engine = base_index.as_query_engine(similarity_top_k=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "e2054269-17b2-436a-96f6-f7fa4504feab",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "根据提供的上下文信息，蔚来汽车第三品牌萤火虫即将交付，并且提到新能源汽车市场渗透率的快速提升。虽然没有直接说明蔚来萤火虫主打的具体市场，但可以推测：\n",
      "\n",
      "**蔚来萤火虫主打的是新能源汽车市场。**\n",
      "\n",
      "这个推测基于以下几点：\n",
      "1. **品牌定位**：作为蔚来汽车的第三品牌，萤火虫很可能继承和延续蔚来在新能源汽车领域的定位。\n",
      "2. **市场趋势**：文中提到新能源汽车市场渗透率的快速提升，暗示萤火虫的推出是为了抓住这一市场机遇。\n",
      "\n",
      "如果需要更具体的市场细分（如高端市场、大众市场等），则需要更多详细信息。\n"
     ]
    }
   ],
   "source": [
    "# `base_index` and `base_query_engine` are already built in the previous cell;\n",
    "# rebuilding them here only re-indexed the same nodes a second time.\n",
    "base_response = base_query_engine.query(query)\n",
    "\n",
    "print(base_response)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "d59b60e1-f0da-4429-bff9-fa7ccfab4cb1",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "'蔚来汽车第三品牌萤火虫的即将交付以及新能源汽车市场渗透率的快速提升，无疑为整个新能源汽车行业'\n"
     ]
    }
   ],
   "source": [
    "# Show the text of the single chunk the baseline engine retrieved.\n",
    "top_base_hit = base_response.source_nodes[0]\n",
    "pprint(top_base_hit.node.text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "df561b93-e698-4759-bafb-83cdc9f6d07e",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "D:\\python311\\Lib\\site-packages\\llama_index\\core\\schema.py:79: UserWarning: Pydantic serializer warnings:\n",
      "  Expected `str` but got `OpenAI` - serialized value may not be as expected\n",
      "  data = handler(self)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "蔚来萤火虫主打中低端市场。根据提供的文本信息，蔚来汽车将形成三个品牌矩阵，覆盖从14万元到80万元的广阔市场区间，其中第三品牌萤火虫作为蔚来汽车布局中低端市场的重要棋子，将于2025年正式交付。这一举措旨在丰富蔚来的产品线，并进一步提升其在新能源汽车市场的竞争力。\n"
     ]
    }
   ],
   "source": [
    "# Sentence-window engine: retrieve the best-matching node, then let the post-processor\n",
    "# substitute the node's \"window\" metadata before the answer is generated.\n",
    "windowed_index = VectorStoreIndex(windowed_nodes)\n",
    "window_swapper = MetadataReplacementPostProcessor(target_metadata_key=\"window\")\n",
    "windowed_query_engine = windowed_index.as_query_engine(\n",
    "    similarity_top_k=1,\n",
    "    node_postprocessors=[window_swapper],\n",
    ")\n",
    "\n",
    "windowed_response = windowed_query_engine.query(query)\n",
    "print(windowed_response)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e673c5a0-d182-43aa-a003-007ffaf07d1c",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9dc77a12-d195-4aed-b405-8a77f1cca200",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.0rc2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
